; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
; Zero-initialized 3072-byte buffer, 16-byte aligned. The tests below address
; it at byte offsets 0, 1024 and 2048 as the source/destination of tile loads
; and stores.
@buf = dso_local global [3072 x i8] zeroinitializer, align 16

; test1: straight-line tile sequence followed by a tail call.
;
; IR shape: three tileloadd64 calls read @buf at offsets 0, 1024 and 2048
; (stride 32) with shape operands (%0, 8), (8, %1) and (%0, %1); a tdpbssd
; combines them and the result is stored back to @buf+2048, the same address
; %5 was loaded from. A llvm.dbg.value that names the tdpbssd result sits
; between the tdpbssd and the store, and the function ends by calling @foo.
;
; Expected assembly: a stack area is zeroed with a zmm store, constant and
; argument-derived shape entries are written into it, a single ldtilecfg
; precedes the first tile instruction, and tilerelease + vzeroupper are
; emitted before the `jmp foo` tail call.
;
; NOTE(review): the dbg.value presumably verifies that a debug use of an
; x86_amx value does not disturb tile configuration -- confirm against the
; originating change.
define dso_local void @test1(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    movw $8, %dx
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm0
; CHECK-NEXT:    movl $buf+1024, %eax
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm1
; CHECK-NEXT:    movl $buf+2048, %eax
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm2
; CHECK-NEXT:    tdpbssd %tmm1, %tmm0, %tmm2
; CHECK-NEXT:    tilestored %tmm2, (%rax,%rcx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    jmp foo # TAILCALL
  %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32)
  %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32)
  %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32)
  %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
  ; Debug-only use of the tile value; no instruction for it appears in the
  ; expected assembly above.
  call void @llvm.dbg.value(metadata x86_amx %6, metadata !DILocalVariable(name: "1", scope: !2), metadata !DIExpression()), !dbg !3
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %6)
  tail call void @foo()
  ret void
}

; test2: a call to @foo precedes a two-way branch whose arms both use tiles.
;
; IR shape: after `call void @foo()` the function branches to either
;   if.true  - tilezero + two tileloadd64 + tdpbssd + tilestored64, or
;   if.false - three tileloadd64 + tdpbssd + tilestored64,
; and both arms join at %exit.
;
; Expected assembly: %0/%1 are copied into %ebp/%ebx (saved with pushq) so
; they survive the call; the config area is zeroed and all shape entries are
; written before `callq foo`; ldtilecfg is emitted once, right after the
; call, ahead of the branch; a single tilerelease precedes retq in the shared
; exit block.
;
; NOTE(review): the branch condition is `undef`. Branching on undef is
; undefined behavior in current LLVM IR semantics; replacing it with a real
; i1 argument would require regenerating the assertions.
define dso_local void @test2(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    subq $72, %rsp
; CHECK-NEXT:    movl %esi, %ebx
; CHECK-NEXT:    movl %edi, %ebp
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB1_3
; CHECK-NEXT:  # %bb.1: # %if.true
; CHECK-NEXT:    movw $8, %ax
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    movl $buf+1024, %edx
; CHECK-NEXT:    tileloadd (%rdx,%rcx), %tmm1
; CHECK-NEXT:    movl $buf+2048, %edx
; CHECK-NEXT:    tileloadd (%rdx,%rcx), %tmm2
; CHECK-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tilestored %tmm0, (%rdx,%rcx)
; CHECK-NEXT:    jmp .LBB1_2
; CHECK-NEXT:  .LBB1_3: # %if.false
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    movw $8, %dx
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm3
; CHECK-NEXT:    movl $buf+1024, %eax
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm4
; CHECK-NEXT:    movl $buf+2048, %eax
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm2
; CHECK-NEXT:    tdpbssd %tmm2, %tmm4, %tmm3
; CHECK-NEXT:    tilestored %tmm3, (%rax,%rcx)
; CHECK-NEXT:  .LBB1_2: # %if.true
; CHECK-NEXT:    addq $72, %rsp
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    retq
  call void @foo()
  br i1 undef, label %if.true, label %if.false

; Arm 1: the first tdpbssd tile operand comes from tilezero instead of a load.
if.true:
  %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 8)
  %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32)
  %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32)
  %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %t4)
  br label %exit

; Arm 2: all three tdpbssd tile operands are loaded from @buf.
if.false:
  %t5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32)
  %t6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32)
  %t7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32)
  %t8 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t5, x86_amx %t6, x86_amx %t7)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %t8)
  br label %exit

exit:
  ret void
}

; test3: the first shape operand of the tile ops is a phi.
;
; IR shape: %if.true computes %0 + 1 and %if.false computes %0 - 1; %exit
; merges them into %5 and uses (%5, %1) as the shape of a tilezero whose
; result is stored to @buf.
;
; Expected assembly: the entry block zeroes the config area and writes the
; %si word and an initial %dil byte; after the incl/decl arms join, %exit
; writes the %dil byte again and only then executes ldtilecfg, followed by
; the tile instructions, tilerelease, vzeroupper and retq.
;
; NOTE(review): the branch condition is `undef` (undefined behavior under
; current LLVM IR semantics); changing it requires regenerating assertions.
define dso_local void @test3(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB2_2
; CHECK-NEXT:  # %bb.1: # %if.true
; CHECK-NEXT:    incl %edi
; CHECK-NEXT:    jmp .LBB2_3
; CHECK-NEXT:  .LBB2_2: # %if.false
; CHECK-NEXT:    decl %edi
; CHECK-NEXT:  .LBB2_3: # %exit
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    tilestored %tmm0, (%rax,%rcx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  br i1 undef, label %if.true, label %if.false

if.true:
  %3 = add i16 %0, 1
  br label %exit

if.false:
  %4 = sub i16 %0, 1
  br label %exit

exit:
  ; %5 is defined by a phi in the same block that consumes it as a shape.
  %5 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
  %6 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %5, i16 %1)
  tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %1, ptr @buf, i64 32, x86_amx %6)
  ret void
}

; test4: two tile-using blocks, each with its own shape phi fed from both
; predecessors.
;
; IR shape: %if.true (%0 + 1) and %if.false (%0 - 1) each branch to both
; %amx1 and %amx2. %amx1 does tilezero + store with shape (%5, %1); %amx2
; does tileloadd64 from @buf+1024 + store with shape (%7, %1). Both then
; jump to %exit.
;
; Expected assembly: the config area is zeroed and partially filled in the
; entry block; each of the if.true / if.false blocks writes the updated
; %dil byte and issues its own ldtilecfg before branching on; each amx block
; ends with its own tilerelease / vzeroupper / retq sequence.
;
; NOTE(review): all three branch conditions are `undef` (undefined behavior
; under current LLVM IR semantics); changing them requires regenerating
; assertions.
define dso_local void @test4(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB3_3
; CHECK-NEXT:  # %bb.1: # %if.true
; CHECK-NEXT:    incl %edi
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB3_4
; CHECK-NEXT:  .LBB3_2: # %amx2
; CHECK-NEXT:    movl $32, %eax
; CHECK-NEXT:    movl $buf+1024, %ecx
; CHECK-NEXT:    tileloadd (%rcx,%rax), %tmm0
; CHECK-NEXT:    movl $buf, %ecx
; CHECK-NEXT:    tilestored %tmm0, (%rcx,%rax)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
; CHECK-NEXT:  .LBB3_3: # %if.false
; CHECK-NEXT:    decl %edi
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB3_2
; CHECK-NEXT:  .LBB3_4: # %amx1
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    tilestored %tmm0, (%rax,%rcx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  br i1 undef, label %if.true, label %if.false

if.true:
  %3 = add i16 %0, 1
  br i1 undef, label %amx1, label %amx2

if.false:
  %4 = sub i16 %0, 1
  br i1 undef, label %amx2, label %amx1

; Tile block 1: zero tile stored to @buf.
amx1:
  %5 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
  %6 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %5, i16 %1)
  tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %1, ptr @buf, i64 32, x86_amx %6)
  br label %exit

; Tile block 2: tile loaded from @buf+1024 and stored to @buf.
amx2:
  %7 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
  %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %7, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32)
  tail call void @llvm.x86.tilestored64.internal(i16 %7, i16 %1, ptr @buf, i64 32, x86_amx %8)
  br label %exit

exit:
  ret void
}

; test5: loop whose second shape operand is a loop-carried phi, with the tile
; ops in only one arm of the loop body.
;
; IR shape: %loop.bb1 defines %2 = phi [%1 from entry, %5 from the latch].
; %if.true does tilezero + store with shape (%0, %2); %if.false only computes
; %1 - 1. The latch %loop.bb2 merges %2 / %4 into %5 and loops back to
; %loop.bb1 while %5 == 7, otherwise exits.
;
; Expected assembly: the config area is set up once before the loop;
; ldtilecfg sits at the loop header (.LBB4_1) and so runs on every iteration;
; the if.false block rewrites the 16-bit shape entry (movw %r8w) before
; control returns to the header; tilerelease is emitted once in %exit.
;
; NOTE(review): the in-loop branch condition is `undef` (undefined behavior
; under current LLVM IR semantics); changing it requires regenerating
; assertions.
define dso_local void @test5(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test5:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    # kill: def $esi killed $esi def $rsi
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movl $buf, %ecx
; CHECK-NEXT:    movl $32, %edx
; CHECK-NEXT:    leal -1(%rsi), %r8d
; CHECK-NEXT:    jmp .LBB4_1
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB4_3: # %if.false
; CHECK-NEXT:    # in Loop: Header=BB4_1 Depth=1
; CHECK-NEXT:    movl %r8d, %esi
; CHECK-NEXT:    movw %r8w, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    cmpw $7, %si
; CHECK-NEXT:    jne .LBB4_5
; CHECK-NEXT:  .LBB4_1: # %loop.bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB4_3
; CHECK-NEXT:  # %bb.2: # %if.true
; CHECK-NEXT:    # in Loop: Header=BB4_1 Depth=1
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    tilestored %tmm0, (%rcx,%rdx)
; CHECK-NEXT:    cmpw $7, %si
; CHECK-NEXT:    je .LBB4_1
; CHECK-NEXT:  .LBB4_5: # %exit
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  br label %loop.bb1

loop.bb1:
  ; Loop-carried shape value consumed by the tile ops in %if.true.
  %2 = phi i16 [ %1, %entry ], [ %5, %loop.bb2 ]
  br i1 undef, label %if.true, label %if.false

if.true:
  %3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 %2)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %2, ptr @buf, i64 32, x86_amx %3)
  br label %loop.bb2

if.false:
  ; Derived from the argument %1, not from the loop-carried %2.
  %4 = sub i16 %1, 1
  br label %loop.bb2

loop.bb2:
  %5 = phi i16 [ %2, %if.true ], [ %4, %if.false ]
  %6 = icmp eq i16 %5, 7
  br i1 %6, label %loop.bb1, label %exit

exit:
  ret void
}

; test6: loop where the shape is computed in the same block that performs the
; tile ops.
;
; IR shape: %loop.bb1 carries %1 (starting at 0); the two arms compute
; %1 + 1 and %1 - 1, merged as %4 in %loop.bb2. That block forms
; %6 = %0 + %4, uses (%0, %6) as the shape of a tilezero + store to @buf, and
; loops back to %loop.bb1 while %4 == 7.
;
; Expected assembly: the config area is zeroed and the %dil byte written once
; in the entry block; inside the loop.bb2 block the sum is stored as a 16-bit
; shape entry (movw %r8w) and ldtilecfg follows it, ahead of tilezero and
; tilestored, on every iteration; tilerelease is emitted once in %exit.
;
; NOTE(review): the in-loop branch condition is `undef` (undefined behavior
; under current LLVM IR semantics); changing it requires regenerating
; assertions.
define dso_local void @test6(i16 signext %0) nounwind {
; CHECK-LABEL: test6:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movl $buf, %ecx
; CHECK-NEXT:    movl $32, %edx
; CHECK-NEXT:    xorl %esi, %esi
; CHECK-NEXT:    jmp .LBB5_1
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB5_3: # %if.false
; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
; CHECK-NEXT:    decl %esi
; CHECK-NEXT:  .LBB5_4: # %loop.bb2
; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
; CHECK-NEXT:    leal (%rdi,%rsi), %r8d
; CHECK-NEXT:    movw %r8w, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    cmpw $7, %si
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    tilestored %tmm0, (%rcx,%rdx)
; CHECK-NEXT:    jne .LBB5_5
; CHECK-NEXT:  .LBB5_1: # %loop.bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB5_3
; CHECK-NEXT:  # %bb.2: # %if.true
; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
; CHECK-NEXT:    incl %esi
; CHECK-NEXT:    jmp .LBB5_4
; CHECK-NEXT:  .LBB5_5: # %exit
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  br label %loop.bb1

loop.bb1:
  %1 = phi i16 [ 0, %entry ], [ %4, %loop.bb2 ]
  br i1 undef, label %if.true, label %if.false

if.true:
  %2 = add i16 %1, 1
  br label %loop.bb2

if.false:
  %3 = sub i16 %1, 1
  br label %loop.bb2

loop.bb2:
  %4 = phi i16 [ %2, %if.true ], [ %3, %if.false ]
  %5 = icmp eq i16 %4, 7
  ; Shape operand recomputed on every iteration from the loop counter.
  %6 = add i16 %0, %4
  %7 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 %6)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %6, ptr @buf, i64 32, x86_amx %7)
  br i1 %5, label %loop.bb1, label %exit

exit:
  ret void
}


; External callee: tail-called at the end of test1 and called at the start of
; test2.
declare dso_local void @foo() nounwind
; Debug intrinsic used once, in test1, with an x86_amx value as its operand.
declare void @llvm.dbg.value(metadata, metadata, metadata)
; AMX intrinsics exercised by the tests. The leading i16 operands are the
; tile shape values; tileloadd64/tilestored64 additionally take a base
; pointer and an i64 stride.
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)

; Minimal debug-info skeleton backing the llvm.dbg.value call in test1, which
; references !2 as the variable scope and !3 as its !dbg location.
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!1}

; Compile unit with placeholder file name and directory.
!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !DIFile(filename: "1", directory: "1"))
; Module flag declaring debug-info version 3.
!1 = !{i32 2, !"Debug Info Version", i32 3}
; Subprogram used only as a scope; it carries no name or line information.
!2 = distinct !DISubprogram(unit: !0)
!3 = !DILocation(line: 1, column: 1, scope: !2)
